This is a not fully edited version of our analysis of the Washington Post’s fatal police shooting dataset. We used data from mid-April, so the analysis does not account for more recent events. Not everything is in perfect order yet, but that will eventually be fixed. We mostly created this markdown file because the code from our separate parts of the analysis is fairly incomprehensible in its current form and takes some editing to run.
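For reference, the code below appears to rely on roughly the following packages (our best reconstruction of the full list; load whichever parts you intend to run):

library(tidyr) #unite()
library(dplyr) #pipes, select(), mutate(), group_by()
library(ggplot2) #plotting
library(maps) #state map data for map_data()
library(ggmap) #geocode(), get_map(), ggmap()
library(cluster) #daisy(), pam()
library(Rtsne) #Rtsne()
library(plot3D) #scatter3D()
library(e1071) #svm()
library(nnet) #multinom()
library(caret) #rfeControl(), rfe()
library(randomForest) #randomForest()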
df.fatal <- read.csv("data-police-shootings-master/fatal-police-shootings-data.csv")
head(df.fatal)
## id name date manner_of_death armed age gender
## 1 3 Tim Elliot 2015-01-02 shot gun 53 M
## 2 4 Lewis Lee Lembke 2015-01-02 shot gun 47 M
## 3 5 John Paul Quintero 2015-01-03 shot and Tasered unarmed 23 M
## 4 8 Matthew Hoffman 2015-01-04 shot toy weapon 32 M
## 5 9 Michael Rodriguez 2015-01-04 shot nail gun 39 M
## 6 11 Kenneth Joe Brown 2015-01-04 shot gun 18 M
## race city state signs_of_mental_illness threat_level
## 1 A Shelton WA True attack
## 2 W Aloha OR False attack
## 3 H Wichita KS False other
## 4 W San Francisco CA True attack
## 5 H Evans CO False attack
## 6 W Guthrie OK False attack
## flee body_camera
## 1 Not fleeing False
## 2 Not fleeing False
## 3 Not fleeing False
## 4 Not fleeing False
## 5 Not fleeing False
## 6 Not fleeing False
Get lonlat data and frequency for map
df.fatal <- unite(data=df.fatal, city.state, c(city, state), sep = " ", remove = FALSE) #create variable city.state for better accuracy
df.loc <- as.data.frame(table(df.fatal$city.state)) #get freq
names(df.loc)[1] <- 'city.state'
lonlat <- geocode(as.character(df.loc$city.state), source = 'dsk') #get latitude and longitude
df.loc <- na.omit(cbind(df.loc, lonlat)) #remove NA
#saveRDS(df.loc, "~/processedData/df.loc.RDS") #save df.loc if use google maps since it takes so long
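Note: the Data Science Toolkit geocoder (source = 'dsk') has since been shut down, so the geocode() call above may no longer work. A rough alternative sketch, assuming the tidygeocoder package is installed (argument and column names per its documented defaults):

df.loc <- tidygeocoder::geocode(df.loc, address = city.state, method = "osm",
                                lat = lat, long = lon) #free OSM/Nominatim backend; adds lat/lon columns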
Plot using white US map without Alaska or Hawaii
US <- map_data("state") #get US map data, white map
ggplot(data=US, aes(x=long, y=lat, group=group)) +
geom_polygon(fill="white", colour="black") +
xlim(-160, 60) + ylim(25,75) +
geom_point(data=df.loc, inherit.aes=F, aes(x=lon, y=lat, size=Freq), colour="blue", alpha=.8) +
coord_cartesian(xlim = c(-130, -50), ylim=c(20,55))
Plot using a Google map. An older version of ggplot2 was needed at the time for ggmap’s Google maps: devtools::install_github("hadley/ggplot2@v2.2.0")
df.loc <- readRDS("processedData/df.loc.RDS") #to load
df.loc$city.state <- as.character(df.loc$city.state)
Separate noncontiguous states
df.loc$city.state <- as.character(df.loc$city.state) #change city.state to character to use grep
hawaii <- df.loc[grepl("HI$",df.loc$city.state),]
alaska <- df.loc[grepl("AK$",df.loc$city.state),]
US contiguous
map <- get_map(location=c(lon = -96.35, lat = 39.70), zoom = 4, source="google",crop=TRUE)
ggmap(map, legend = "none") +
geom_point(aes(x = lon, y = lat, size=Freq), data = df.loc, alpha = .7, color = "darkblue") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank())
Alaska
map <- get_map(location = "alaska", zoom = 4)
ggmap(map, legend = "none") +
geom_point(aes(x = lon, y = lat, size=Freq), data = alaska, alpha = .7, color = "darkblue") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank())
Hawaii
map <- get_map(location = "hawaii", zoom = 7)
ggmap(map, legend = "none") +
geom_point(aes(x = lon, y = lat, size=Freq), data = hawaii, alpha = .7, color = "darkblue") +
theme(axis.title=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank())
Create df with lat and lon
lonlat <- geocode(as.character(df.fatal$city.state), source = 'dsk')
df.location <- cbind(df.fatal, lonlat)
#saveRDS(df.location, "processedData/df.location.RDS")
Use this data for df.fatal
df.fatal <- readRDS("processedData/df.location.RDS")
df.na <- df.fatal[rowSums(is.na(df.fatal)) > 0,] #see rows with missing values
df.fatal <- na.omit(df.fatal) #few NA, so just won't use them
Create new variable minority
df.fatal$minority <- 'white'
df.fatal$minority[df.fatal$race =='B'] <- 'black'
df.fatal$minority[df.fatal$race =='H'] <- 'hispanic'
df.fatal$minority[df.fatal$race !='B' & df.fatal$race != 'W' & df.fatal$race != 'H'] <- 'other'
df.fatal$minority <- factor(df.fatal$minority)
Create a distance matrix for visualization; use Gower distance since the data is mostly categorical
drop <- c('name', 'date', 'city.state', 'city', 'state', 'race')
df.fatal.clean <- df.fatal[ , !(names(df.fatal) %in% drop)]
gower_dist <- daisy(df.fatal.clean[, -1], metric = "gower")
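A quick sanity check on the scale of the dissimilarities (the cluster package provides a summary method for dissimilarity objects):

summary(gower_dist) #min/quartiles/max of all pairwise Gower dissimilarities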
Check Gower dist works
gower_mat <- as.matrix(gower_dist)
Output most similar pair
df.fatal.clean[which(gower_mat == min(gower_mat[gower_mat != min(gower_mat)]),
arr.ind = TRUE)[1, ], ]
## id manner_of_death armed age gender signs_of_mental_illness
## 2015 2238 shot gun 33 M False
## 1914 2134 shot gun 33 M False
## threat_level flee body_camera lon lat minority
## 2015 other Not fleeing False -118.2612 33.81332 hispanic
## 1914 other Not fleeing False -118.2484 33.97395 hispanic
Output most dissimilar pair
df.fatal.clean[which(gower_mat == max(gower_mat[gower_mat != max(gower_mat)]),
arr.ind = TRUE)[1, ], ]
## id manner_of_death armed age gender signs_of_mental_illness
## 2124 2372 shot knife 30 F True
## 1637 1829 shot and Tasered gun 28 M False
## threat_level flee body_camera lon lat minority
## 2124 other Car False -97.77126 30.32637 black
## 1637 attack Foot True -149.33601 67.09454 other
Calculate silhouette width for many k using PAM
sil_width <- c(NA)
for(i in 2:10){
pam_fit <- pam(gower_dist,
diss = TRUE,
k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
Plot silhouette width (higher is better)
plot(1:10, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
Looks like K = 2 is best
pam_fit <- pam(gower_dist, diss = TRUE, k = 2)
pam_results <- df.fatal.clean %>% dplyr::select(-id) %>%
mutate(cluster = pam_fit$clustering) %>%
group_by(cluster) %>%
do(the_summary = summary(.))
Look at numeric summaries of the clusters
pam_results$the_summary
## [[1]]
## manner_of_death armed age gender
## shot :1440 gun :1122 Min. :14.0 : 0
## shot and Tasered: 70 toy weapon : 73 1st Qu.:27.0 F: 55
## unarmed : 72 Median :34.0 M:1455
## vehicle : 63 Mean :36.9
## knife : 53 3rd Qu.:45.0
## undetermined: 53 Max. :91.0
## (Other) : 74
## signs_of_mental_illness threat_level flee
## False:1171 attack :1352 : 26
## True : 339 other : 74 Car : 217
## undetermined: 84 Foot : 192
## Not fleeing:1009
## Other : 66
##
##
## body_camera lon lat minority cluster
## False:1367 Min. :-168.02 Min. :19.56 black :384 Min. :1
## True : 143 1st Qu.:-112.07 1st Qu.:33.52 hispanic:265 1st Qu.:1
## Median : -95.31 Median :36.07 other :108 Median :1
## Mean : -97.86 Mean :36.72 white :753 Mean :1
## 3rd Qu.: -83.92 3rd Qu.:39.91 3rd Qu.:1
## Max. : -68.10 Max. :71.29 Max. :1
##
##
## [[2]]
## manner_of_death armed age gender
## shot :586 knife :272 Min. : 6.00 : 0
## shot and Tasered: 84 unarmed : 88 1st Qu.:26.00 F: 37
## vehicle : 83 Median :34.00 M:633
## gun : 67 Mean :35.64
## undetermined: 44 3rd Qu.:43.00
## toy weapon : 22 Max. :83.00
## (Other) : 94
## signs_of_mental_illness threat_level flee body_camera
## False:469 attack : 51 : 19 False:575
## True :201 other :574 Car :110 True : 95
## undetermined: 45 Foot : 65
## Not fleeing:460
## Other : 16
##
##
## lon lat minority cluster
## Min. :-157.93 Min. :19.56 black :169 Min. :2
## 1st Qu.:-111.92 1st Qu.:33.53 hispanic:112 1st Qu.:2
## Median : -88.10 Median :36.84 other : 63 Median :2
## Mean : -95.00 Mean :36.74 white :326 Mean :2
## 3rd Qu.: -81.47 3rd Qu.:40.10 3rd Qu.:2
## Max. : -68.01 Max. :61.58 Max. :2
##
The clusters look to be driven mainly by threat level, race, and the armed variable. Look at the medoids
df.fatal.clean[pam_fit$medoids, ]
## id manner_of_death armed age gender signs_of_mental_illness
## 395 498 shot gun 35 M False
## 2100 2344 shot knife 33 M False
## threat_level flee body_camera lon lat minority
## 395 attack Not fleeing False -95.97522 35.62283 white
## 2100 other Not fleeing False -89.57461 37.33529 white
t-SNE: t-distributed stochastic neighbor embedding
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y"))
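t-SNE is stochastic, so the embedding changes from run to run; seeding it and being explicit about perplexity (Rtsne’s default is 30) makes the plot reproducible. A hedged variant of the Rtsne call above (run before building tsne_data):

set.seed(123) #fix the random initialization
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE, perplexity = 30)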
Let’s look at some variables in 2D to see if anything separates well
tsne_data <- data.frame(cluster = factor(pam_fit$clustering), df.fatal.clean, tsne_data, race=df.fatal$race)
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color = cluster))
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=threat_level))
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=manner_of_death))
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=signs_of_mental_illness))
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=body_camera))
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=minority))
ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes(color=race))
We can compare with classical MDS
fit <- cmdscale(gower_dist, k=2) #k is the number of dim
#fit #view results
Plot MDS without coloring groups
fit <- as.data.frame(fit)
ggplot() + geom_point(data=fit, aes(fit[,1], fit[,2]))
Plot MDS, color by cluster
fit <- cbind(fit, df.fatal, cluster = factor(pam_fit$clustering))
ggplot() + geom_point(data=fit, aes(x=V1, y=V2, color=cluster))
fat.dat <- readRDS("processedData/df.location.RDS")
eliminate the rows with missing entries
miss.dat <- fat.dat[rowSums(is.na(fat.dat)) == 0,]
miss.dat <- miss.dat[!(miss.dat$id %in% c(2158, 2304)),] #drop duplicate entries
discretize weapons
weapon_converter <- function(x){
if(x %in% c('gun', 'guns and explosives', 'gun and knife', 'hatchet and gun',
'machete and gun')) return(as.factor('gun'))
if(x %in% c('knife','pole and knife','sword','machete')) return(as.factor('knife'))
if(x %in% c('vehicle','motorcycle'))return(as.factor('vehicle'))
if(x %in% c('','undetermined')) return(as.factor('undetermined'))
if(x %in% c('toy weapon')) return(as.factor('toy weapon'))
if(x == 'unarmed') return(as.factor('unarmed'))
return(as.factor('other'))
}
weap.dat <- sapply(miss.dat$armed, weapon_converter)
use.dat <- miss.dat[,-c(1:3,9:11)]
use.dat[,2] <- weap.dat
Use Gower distance; Euclidean distance doesn’t make sense for mostly categorical data
gower.dist <- daisy(use.dat, metric = "gower")
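To make that choice concrete, here is a tiny illustration on made-up data of how Gower distance handles mixed types: categorical columns contribute 0/1 mismatches, numeric columns contribute range-scaled absolute differences, and the result is the average across columns.

toy <- data.frame(armed = factor(c("gun", "knife", "gun")),
                  age = c(20, 30, 40))
daisy(toy, metric = "gower")
#rows 1 vs 3: armed matches (0), age differs by the full range (1), so the distance is (0 + 1)/2 = 0.5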
try three dimensions
gower.mds3 <- cmdscale(gower.dist, k = 3, eig = TRUE)
gmds.res3 <- data.frame(gower.mds3$points)
scatter3D(x = gmds.res3$X1, y = gmds.res3$X2, z = gmds.res3$X3, colvar = as.numeric(use.dat$race),
axis.ticks = T, ticktype = "detailed",pch = 19, cex = 0.5, bty = "g", theta = 85, phi = 15)
try two dimensions
gower.mds <- cmdscale(gower.dist, k = 2, eig = TRUE)
gmds.res <- data.frame(gower.mds$points)
plain MDS plot
ggplot(data = gmds.res, aes(x = X1, y = X2)) + geom_point(cex = 0.5) + labs(title = "Classical MDS")
colored by mental illness
ggplot(data = gmds.res, aes(x = X1, y = X2)) +
geom_point(aes(color = use.dat$signs_of_mental_illness), cex = 0.5) +
labs(title = "Classical MDS: Mental Illness", color = "Mental Illness") +
guides(colour = guide_legend(override.aes = list(size=2)))
colored by threat level
ggplot(data = gmds.res, aes(x = X1, y = X2)) +
geom_point(aes(color = use.dat$threat_level), cex = 0.5) +
scale_color_hue(labels = c("Attack", "Other","Undet.")) +
labs(title = "Classical MDS: Threat Level", color = "Threat Level") +
guides(colour = guide_legend(override.aes = list(size=2)))
colored by race
ggplot(data = gmds.res, aes(x = X1, y = X2)) + geom_point(aes(color = use.dat$race), cex = 0.5) +
labs(title = "Classical MDS: Race", color = "Race") +
scale_color_hue(labels = c("Undet.", "Asian","Black", "Hispanic","Nat. Am.", "Other race","White")) +
guides(colour = guide_legend(override.aes = list(size=2)))
colored by body camera
ggplot(data = gmds.res, aes(x = X1, y = X2)) +
geom_point(aes(color = use.dat$body_camera), cex = 0.5) +
labs(title = "Classical MDS: Body Camera", color = "Body Camera") +
guides(colour = guide_legend(override.aes = list(size=2)))
race.counts <- table(use.dat$race)
race.prop <- prop.table(race.counts)*100
rownames(race.prop) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
race.prop <- data.frame(race.prop)
ggplot(data = race.prop, aes(x = Var1, y = Freq)) + geom_bar(aes(fill = Var1), stat = "identity") +
labs(title = "Racial Breakdown: Fatal Police Shootings", y = "Percent", x = "") +
theme(axis.text=element_text(size=12)) + guides(fill = FALSE) + ylim(c(0,100))
us.race <- c(0, 4.7, 12.2, 16.3, 0.9, 2.1, 63.7) #approximate U.S. population percentages, same race order as race.prop (Undet., Asian, Black, Hispanic, Nat. Am., Other race, White)
us.race <- data.frame(us.race,race.prop[,1])
ggplot(data = us.race, aes(x = race.prop[,1], y = us.race)) +
geom_bar(aes(fill = race.prop[,1]),stat = "identity") +
labs(title = "Racial Breakdown: U.S. Population", y = "Percent", x = "") +
theme(axis.text=element_text(size=12)) + guides(fill = FALSE) + ylim(c(0,100))
rw.tab <- table(use.dat$race, use.dat$armed)
rownames(rw.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
rw.dat <- data.frame(prop.table(rw.tab,2)*100)
colnames(rw.dat)[1] <- "Race"
ggplot(data = rw.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(strip.text = element_text(size = 11))
ra.tab <- table(use.dat$race, use.dat$threat_level)
rownames(ra.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
ra.dat <- data.frame(prop.table(ra.tab,2)*100)
colnames(ra.dat)[1] <- "Race"
colnames(ra.dat)[2] <- "Threat"
ggplot(data = ra.dat, aes(x = Race, y = Freq)) + facet_wrap(~Threat) +
geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(strip.text = element_text(size = 11))
ra.dat2 <- data.frame(prop.table(ra.tab,1)*100)
colnames(ra.dat2)[1] <- "Race"
colnames(ra.dat2)[2] <- "Threat"
ggplot(data = ra.dat2, aes(x = Threat, y = Freq)) + facet_wrap(~Race) +
geom_bar(aes(fill = Threat), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(strip.text = element_text(size = 11))
rmi.tab <- table(use.dat$race, use.dat$signs_of_mental_illness)
rownames(rmi.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
colnames(rmi.tab) <- factor(c("No Signs of Mental Illness", "Signs of Mental Illness"))
rmi.dat <- data.frame(prop.table(rmi.tab,2)*100)
colnames(rmi.dat)[1] <- "Race"
ggplot(data = rmi.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(strip.text = element_text(size = 11))
rbc.tab <- table(use.dat$race, use.dat$body_camera)
rownames(rbc.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
colnames(rbc.tab) <- factor(c("No Body Camera", "Body Camera"))
rbc.dat <- data.frame(prop.table(rbc.tab,2)*100)
colnames(rbc.dat)[1] <- "Race"
ggplot(data = rbc.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(strip.text = element_text(size = 11))
rf.tab <- table(use.dat$race, use.dat$flee)
rownames(rf.tab) <- factor(c("Undet.", "Asian", "Black", "Hispanic", "Nat. Am.", "Other race", "White"))
colnames(rf.tab)[1] <- "Undetermined"
rf.dat <- data.frame(prop.table(rf.tab,2)*100)
colnames(rf.dat)[1] <- "Race"
ggplot(data = rf.dat, aes(x = Race, y = Freq)) + facet_wrap(~Var2) +
geom_bar(aes(fill = Race), stat = "identity") + ylim(c(0,100)) + labs(y = "Percent", x = "") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(strip.text = element_text(size = 11))
## Kidus
binary_decision <- function(vec){
#converts (0,0.5) to 0 and (0.5,1) to 1
return(ifelse(vec < 0.5, 0, 1))
}
Load in the data shared with Laura, Zoe, and Ed
master_data = readRDS("processedData/df.location.RDS")
Clean up the data to remove duplicate entries.
master_data = master_data[complete.cases(master_data),]
master_data = master_data[-which(master_data$id == 2304),]
master_data = master_data[-which(master_data$id == 2158),]
There were many kinds of weapons that were mentioned only once, and others that were combined categories, so we chose to simplify. We created the new categories “gun”, “knife”, “vehicle”, “undetermined”, “toy weapon”, “unarmed”, and “other”; a vectorized version of the converter is sketched after the function below.
weapon_converter <- function(x){
# converts weapon categories
if(x %in% c('gun', 'guns and explosives', 'gun and knife', 'hatchet and gun',
'machete and gun')) return(as.factor('gun'))
if(x %in% c('knife','pole and knife','sword','machete')) return(as.factor('knife'))
if(x %in% c('vehicle','motorcycle'))return(as.factor('vehicle'))
if(x %in% c('','undetermined')) return(as.factor('undetermined'))
if(x %in% c('toy weapon')) return(as.factor('toy weapon'))
if(x == 'unarmed') return(as.factor('unarmed'))
return(as.factor('other'))
}
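A vectorized version of the same collapsing (an equivalent sketch, not the code we actually ran) that avoids the element-wise sapply:

collapse_armed <- function(armed) {
  #map the raw 'armed' strings onto the seven simplified categories
  armed <- as.character(armed)
  out <- rep("other", length(armed))
  out[armed %in% c("gun", "guns and explosives", "gun and knife",
                   "hatchet and gun", "machete and gun")] <- "gun"
  out[armed %in% c("knife", "pole and knife", "sword", "machete")] <- "knife"
  out[armed %in% c("vehicle", "motorcycle")] <- "vehicle"
  out[armed %in% c("", "undetermined")] <- "undetermined"
  out[armed == "toy weapon"] <- "toy weapon"
  out[armed == "unarmed"] <- "unarmed"
  factor(out)
}
#weapon_cat = collapse_armed(master_data$armed) #would replace the sapply call below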
# add weapon category column
weapon_cat = sapply(master_data$armed, function(x) weapon_converter(x))
master_data = cbind(master_data, weapon_cat)
race_converter <- function(x){
# to factor
if(x == 'B') return(as.factor('B'))
if(x == 'W') return(as.factor('W'))
return(as.factor('O'))
}
# add race category column
race_cat = sapply(master_data$race, function(x) race_converter(x))
master_data = cbind(master_data, race_cat)
We were interested in any regional effects, so we subdivided the states into four regions: ‘Northeast’, ‘Midwest’, ‘South’, and ‘West’ (a compact lookup-table version of the converter is sketched after the function below).
region_converter <- function(x){
# Create new factor "region"
if(x == 'CT'| x == 'ME'| x == 'MA'| x == 'NH'| x == 'RI'|
x == 'VT'| x == 'NJ'| x == 'NY'| x == 'PA') return(as.factor(1))
if(x == 'IL'| x == 'IN'| x == 'MI'| x == 'OH'| x == 'WI'|
x == 'IA'| x == 'KS'| x == 'MN'| x == 'MO'| x == 'NE'|
x == 'SD'| x == 'ND') return(as.factor(2))
if(x == 'DE'| x == 'FL'| x == 'GA'| x == 'MD'| x == 'NC'|
x == 'SC'| x == 'VA'| x == 'DC'| x == 'WV'| x == 'AL'|
x == 'KY'| x == 'MS'| x == 'TN'| x == 'AR'| x == 'LA'|
x == 'OK'| x == 'TX') return(as.factor(3))
if(x == 'AZ'| x == 'CO'| x == 'ID'| x == 'MT'| x == 'NV'|
x == 'NM'| x == 'UT'| x == 'WY'| x == 'AK'| x == 'CA'|
x == 'HI'| x == 'OR'| x == 'WA') return(as.factor(4))
}
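An equivalent lookup-table version of region_converter (a sketch with the same four-region split, not the code we actually ran):

northeast <- c("CT","ME","MA","NH","RI","VT","NJ","NY","PA")
midwest <- c("IL","IN","MI","OH","WI","IA","KS","MN","MO","NE","SD","ND")
south <- c("DE","FL","GA","MD","NC","SC","VA","DC","WV","AL","KY","MS","TN","AR","LA","OK","TX")
west <- c("AZ","CO","ID","MT","NV","NM","UT","WY","AK","CA","HI","OR","WA")
region_lookup <- c(setNames(rep(1, length(northeast)), northeast), #state abbreviation -> region code
                   setNames(rep(2, length(midwest)), midwest),
                   setNames(rep(3, length(south)), south),
                   setNames(rep(4, length(west)), west))
#region = factor(region_lookup[as.character(master_data$state)]) #would replace the sapply call below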
region_defn = c('Northeast','Midwest','South','West')
region = sapply(master_data$state, function(x) region_converter(as.character(x)))
master_data = cbind(master_data, region)
#subset the data
drops1 = c("state","city","city.state","name","id","region","race", "date","armed")
fatal_data1 = master_data[,!(names(master_data) %in% drops1)]
drops2 = c("state","city","city.state","name","id","race","region","date","manner_of_death","armed","gender","flee","body_camera","weapon_cat")
fatal_data2 = master_data[,!(names(master_data) %in% drops2)]
drops3 = c("state","city","city.state","name","id","lat","lon","race", "date","armed")
fatal_data3 = master_data[,!(names(master_data) %in% drops3)]
drops4 = c("state","city","city.state","name","id","race", "date","armed")
fatal_data4 = master_data[,!(names(master_data) %in% drops4)]
drops5 = c("state","city","city.state","name","id","race","region","date","manner_of_death","armed","flee","body_camera")
fatal_data5 = master_data[,!(names(master_data) %in% drops5)]
Split up our data into train and test set
smp_size <- floor(0.75 * nrow(fatal_data1)) #75% of the dataset
set the seed to make the partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data1)), size = smp_size)
train <- fatal_data1[train_ind, ]
test <- fatal_data1[-train_ind, ]
Y.train <- train$race_cat
Y.test <- test$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
X.test <- test[,-which(names(test)=='race_cat')]
Train Gaussian SVM on full training set.
cost = 95
gamma = 0.01
g.svm <- svm(race_cat ~., data = train,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 205 91 52
## W 197 653 205
## B 25 59 146
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.6148194
Test Gaussian SVM on full test set
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 62 31 18
## W 51 218 79
## B 8 27 51
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6073394
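The cost and gamma values above look hand-picked; one hedged way to choose them is a cross-validated grid search with e1071’s tune() (the ranges below are illustrative guesses, not the grid we actually searched):

set.seed(123)
svm.tune <- tune(svm, race_cat ~ ., data = train,
                 type = 'C-classification', kernel = 'radial',
                 ranges = list(cost = c(1, 10, 50, 100), gamma = c(0.005, 0.01, 0.05)))
svm.tune$best.parameters #cost/gamma pair with the lowest cross-validation error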
### POLY SVM
Polynomial kernel. Train Poly SVM on full training set
cost = 10
gamma = 0.01
degree = 3
coef0 = 2
g.svm <- svm(race_cat ~., data = train,
type = 'C-classification', kernel = 'polynomial',
cost = cost, gamma = gamma, coef0 = coef0, degree=degree)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 206 95 53
## W 198 648 212
## B 23 60 138
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.6074709
Test Poly SVM on full test set
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 59 30 16
## W 55 215 82
## B 7 31 50
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5944954
Linear kernel. Train linear SVM on full training set
cost = 100
g.svm <- svm(race_cat ~., data = train,
type = 'C-classification', kernel = 'linear',
cost = cost)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 204 112 52
## W 195 601 204
## B 28 90 147
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.5829761
Test linear SVM on full test set
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 61 32 18
## W 51 210 77
## B 9 34 53
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5944954
Train Multinomial Logistic Regression on full training set
m.log.r <- multinom(race_cat~., data=train)
## # weights: 66 (42 variable)
## initial value 1794.033867
## iter 10 value 1580.801489
## iter 20 value 1480.560485
## iter 30 value 1477.198958
## iter 40 value 1476.880296
## final value 1476.879822
## converged
m.log.r.pred.train <- predict(m.log.r,newdata = X.train)
table(m.log.r.pred.train, Y.train)
## Y.train
## m.log.r.pred.train O W B
## O 184 103 44
## W 202 594 201
## B 41 106 158
print(sum(m.log.r.pred.train == Y.train)/length(Y.train))
## [1] 0.5731782
Test Multinomial Logistic Regression on full test set
m.log.r.pred <- predict(m.log.r, newdata = X.test)
table(m.log.r.pred, Y.test)
## Y.test
## m.log.r.pred O W B
## O 53 26 13
## W 56 213 77
## B 12 37 58
print(sum(m.log.r.pred == Y.test)/length(Y.test))
## [1] 0.5944954
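Besides hard class labels, multinom() can also return per-class predicted probabilities, which is useful for seeing how confident the model is on individual cases:

head(predict(m.log.r, newdata = X.test, type = "probs")) #class probabilities for the first few test rows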
#### REGION 1 - NORTHEAST
reg.data <- fatal_data4[fatal_data4$region == 1,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 250
gamma = 0.01
g.svm <- svm(race_cat ~. -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 5 1 0
## W 7 48 8
## B 9 9 36
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.7235772
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 0 2 0
## W 3 12 5
## B 4 4 11
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5609756
#### REGION 2 - MIDWEST
reg.data <- fatal_data4[fatal_data4$region == 2,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 220
gamma = 0.1
g.svm <- svm(race_cat ~. -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 26 0 0
## W 4 152 5
## B 2 5 77
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.9409594
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 0 4 3
## W 4 35 15
## B 1 7 22
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6263736
#### REGION 3 - SOUTH
reg.data <- fatal_data4[fatal_data4$region == 3,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~. -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 66 3 4
## W 33 349 55
## B 4 15 145
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.8308605
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 15 8 6
## W 19 84 36
## B 4 19 34
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5911111
#### REGION 4 - WEST
reg.data <- fatal_data4[fatal_data4$region == 4,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~. -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 215 35 17
## W 38 210 14
## B 2 0 33
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.8120567
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 53 26 14
## W 27 46 10
## B 7 5 1
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5291005
### FEATURE SELECTION
Set the seed to make the partition reproducible. Exclude region for this part.
smp_size <- floor(0.75 * nrow(fatal_data1)) #'75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data1)), size = smp_size)
train <- fatal_data1[train_ind, ]
test <- fatal_data1[-train_ind, ]
Y.train <- train$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
Y.test <- test$race_cat
X.test <- test[,-which(names(test)=='race_cat')]
X.train <- train[,-which(names(train)=='race_cat'|names(train)=='signs_of_mental_illness'|names(train)=='threat_level')] #note: the RFE output below still ranks these variables, so this narrower X.train does not appear to have been used
blep <- function(x){
if(x == 'B') return(as.factor(1))
if(x == 'W') return(as.factor(2))
if(x == 'O') return(as.factor(3))
}
Y.train <- sapply(Y.train, blep)
define the control using a random forest selection function
control <- rfeControl(functions=rfFuncs, method="cv", number=10)
run the RFE algorithm
results <- rfe(X.train, Y.train, sizes=c(1:8), rfeControl=control)
summarize the results
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.5230 0.2462 0.02906 0.05262
## 2 0.5462 0.2605 0.04939 0.07722
## 3 0.5708 0.2954 0.02759 0.04567
## 4 0.5690 0.2946 0.04643 0.07543
## 5 0.5713 0.2939 0.03917 0.06454
## 6 0.5848 0.3130 0.02501 0.04204
## 7 0.5897 0.3165 0.02767 0.04469 *
## 8 0.5891 0.3132 0.02924 0.04623
## 10 0.5799 0.3053 0.03477 0.05571
##
## The top 5 variables (out of 7):
## lon, age, lat, signs_of_mental_illness, weapon_cat
list the chosen features
predictors(results)
## [1] "lon" "age"
## [3] "lat" "signs_of_mental_illness"
## [5] "weapon_cat" "threat_level"
## [7] "gender"
plot the results
plot(results, type=c("g", "o"))
In light of the above, let us only use lon, age, lat, mental illness, threat level, gender, and weapon_cat
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data5)), size = smp_size)
train <- fatal_data5[train_ind, ]
test <- fatal_data5[-train_ind, ]
Y.train <- train$race_cat
Y.test <- test$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
X.test <- test[,-which(names(test)=='race_cat')]
RFE TRAIN
cost = 230
gamma = 0.02
g.svm <- svm(race_cat ~., data = train,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 209 82 50
## W 195 662 203
## B 23 59 150
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.6252296
RFE TEST
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 57 28 16
## W 58 222 83
## B 6 26 49
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6018349
### LAT - LONG INTERACTION TERM
set.seed(123)
train_ind <- sample(seq_len(nrow(fatal_data5)), size = smp_size)
train <- fatal_data5[train_ind, ]
test <- fatal_data5[-train_ind, ]
Y.train <- train$race_cat
Y.test <- test$race_cat
X.train <- train[,-which(names(train)=='race_cat')]
X.test <- test[,-which(names(test)=='race_cat')]
RFE TRAIN
cost = 230
gamma = 0.02
g.svm <- svm(race_cat ~. + lat*lon, data = train,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 208 75 47
## W 194 664 198
## B 25 64 158
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.630741
RFE TEST
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 60 26 17
## W 55 223 76
## B 6 27 55
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6201835
### COMBINE RFE WITH REGION STRATIFICATION
to_drop <- c("manner_of_death","flee","body_camera")
all.reg.data <- fatal_data4[,!(names(fatal_data4) %in% to_drop)]
#### REGION 1 - NORTHEAST
reg.data <- all.reg.data[all.reg.data$region == 1,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 250
gamma = 0.01
g.svm <- svm(race_cat ~ . -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 5 1 0
## W 8 50 10
## B 8 7 34
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.7235772
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 0 1 0
## W 3 15 6
## B 4 2 10
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.6097561
#### REGION 2 - MIDWEST
reg.data <- all.reg.data[all.reg.data$region == 2,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 220
gamma = 0.1
g.svm <- svm(race_cat ~ . -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 21 1 0
## W 6 145 9
## B 5 11 73
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.8819188
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 0 6 1
## W 4 31 21
## B 1 9 18
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5384615
#### REGION 3 - SOUTH
reg.data <- all.reg.data[all.reg.data$region == 3,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~ . -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 42 3 3
## W 54 349 77
## B 7 15 124
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.764095
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 14 6 3
## W 19 87 48
## B 5 18 25
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.56
#### REGION 4 - WEST
reg.data <- all.reg.data[all.reg.data$region == 4,]
split up our data into train and test set
smp_size <- floor(0.75 * nrow(reg.data)) #75% of the dataset
set.seed(123)
train_ind <- sample(seq_len(nrow(reg.data)), size = smp_size)
train.reg <- reg.data[train_ind,]
test.reg <- reg.data[-train_ind,]
Y.train <- train.reg$race_cat
Y.test <- test.reg$race_cat
X.train <- train.reg[,-which(names(train.reg)=='race_cat')]
X.test <- test.reg[,-which(names(test.reg)=='race_cat')]
Train Gaussian SVM on region
cost = 240
gamma = 0.05
g.svm <- svm(race_cat ~ . -region, data = train.reg,
type = 'C-classification', kernel = 'radial',
cost = cost, gamma = gamma)
g.svm.pred.train <- predict(g.svm,newdata = X.train)
table(g.svm.pred.train, Y.train)
## Y.train
## g.svm.pred.train O W B
## O 208 57 34
## W 46 187 14
## B 1 1 16
print(sum(g.svm.pred.train == Y.train)/length(Y.train))
## [1] 0.7287234
Test Gaussian SVM on region
g.svm.pred <- predict(g.svm,newdata = X.test)
table(g.svm.pred, Y.test)
## Y.test
## g.svm.pred O W B
## O 61 32 17
## W 24 45 6
## B 2 0 2
print(sum(g.svm.pred == Y.test)/length(Y.test))
## [1] 0.5714286
ps.data = readRDS("processedData/df.location.RDS")
set.seed(414)
### Functions
loss = function(y, f.hat, c){
#Loss Function
a = sum(y == "True" & f.hat == "False")
b = sum(y == "False" & f.hat == "True")
out = a*c + b
return(out)
}
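A quick sanity check of the loss function on toy labels: with c = 9, missing a "True" body-camera case costs nine times as much as a false alarm.

y.toy <- c("True", "True", "False", "False")
fhat.toy <- c("False", "True", "True", "False")
loss(y.toy, fhat.toy, c = 9) #one missed True (9) plus one false True (1) = 10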
rf = function(X.train, Y.train, X.test, Y.test, thres, c = 9) {
#Performs the Random Forest. Returns the confusion matrix,
#the class error rates, and the value of the loss function
rf = randomForest(X.train, Y.train)
pred.rf = predict(rf, X.test, type = "prob")
t.num = which(colnames(pred.rf) == "True")
pred.table = unname(ifelse(pred.rf[,t.num] > thres, "True", "False"))
pred.table = cbind(pred.table,as.character(Y.test))
out = table(pred.table[,1],pred.table[,2])
false.err = 1-sum(pred.table[,1] == "False" & pred.table[,2] == "False")/sum(pred.table[,2] == "False")
true.err = 1-sum(pred.table[,1] == "True" & pred.table[,2] == "True")/sum(pred.table[,2] == "True")
loss = loss(y = Y.test, f.hat = pred.table[,1], c)
return(list(out,false.err,true.err,loss))
}
folds = function(n, k){
#Creates k folds. Used for the CV function
size = round(n/k)
out = list()
start = 0
indices = sample(1:n,n,replace = FALSE)
for(i in 1:k) {
if(i < k) out[[i]] = indices[(start+1):(start+size)]
else out[[i]] = indices[(start+1):n]
start = start+size
}
return(out)
}
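A quick check, on a hypothetical n and k, that folds() returns a partition of the index set:

sets <- folds(12, 5) #five folds over 12 observations
sort(unlist(sets)) #should be exactly 1:12, each index appearing once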
up = function(X.train, Y.train){
#Up Sample the minority class
bcam.t = which(Y.train == "True")
bcam.f = which(Y.train == "False")
up.t = sample(bcam.t,length(bcam.f),replace = TRUE)
up.f = sample(bcam.f,length(bcam.f),replace = FALSE)
Y.up = Y.train[c(up.t,up.f)]
X.up = X.train[c(up.t,up.f),]
return(list(X.up,Y.up))
}
down = function(X.train, Y.train){
#Down Sample the majority class
bcam.t = which(Y.train == "True")
bcam.f = which(Y.train == "False")
down.t = sample(bcam.t,length(bcam.t),replace = FALSE)
down.f = sample(bcam.f,length(bcam.t),replace = FALSE)
Y.down = Y.train[c(down.t,down.f)]
X.down = X.train[c(down.t,down.f),]
return(list(X.down,Y.down))
}
cv.rf = function(X, Y, thres, c = 9, k = 5, r = 1,
sample = c("none","up","down")){
#Perform k-fold cross validation r times. Returns the loss
#along with the error rate for the "true" class
loss = 0
true.err = 0
for(j in 1:r) {
n = length(Y)
sets = folds(n, k)
for(i in 1:k) {
X.train = X[unlist(sets[-i]),]
Y.train = Y[unlist(sets[-i])]
X.test = X[sets[[i]],]
Y.test = Y[sets[[i]]]
if(sample == "up") {
temp = up(X.train,Y.train)
X.train = temp[[1]]
Y.train = temp[[2]]
}
if(sample == "down") {
temp = down(X.train,Y.train)
X.train = temp[[1]]
Y.train = temp[[2]]
}
results = rf(X.train, Y.train, X.test, Y.test, thres, c)
loss = loss + results[[4]]
true.err = true.err + results[[3]]
}
}
return(list(loss/(k*r),(true.err)/(k*r)))
}
### Data Cleaning
Y = as.factor(ps.data$body_camera)
X = as.data.frame(ps.data[,c(4:14,16,17)])
Y = Y[-c(1935,2050)]
X = X[-c(1935,2050),]
#omit missing data
Y = Y[complete.cases(X)]
X = X[complete.cases(X),]
#subset "Armed"
X$armed = as.character(X$armed)
gun = grep("gun", X$armed)
X$armed[gun] = "gun"
other = which(X$armed != "gun" &
X$armed != "unarmed" &
X$armed != "undetermined" &
X$armed != "knife" &
X$armed != "vehicle" &
X$armed != "toy weapon")
X$armed[other] = "other"
X$armed = as.factor(X$armed)
#remove city/state variables
X = X[-c(6:8)]
#create a training/test set
testset = function(X, Y, size = 500) {
draws = sample(seq_along(Y), length(Y), replace = FALSE) #shuffle the row indices 1..n
test = draws[1:size]
train = draws[(size+1):length(Y)]
Y.test = Y[test]
X.test = X[test,]
Y.train = Y[train]
X.train = X[train,]
return(list(X.train,Y.train,X.test,Y.test))
}
test = testset(X, Y, 500)
X.train = test[[1]]
Y.train = test[[2]]
X.test = test[[3]]
Y.test = test[[4]]
rf.train = randomForest(X.train,Y.train)
construct the oversampled training set
bcam.t = which(Y.train == "True")
bcam.f = which(Y.train == "False")
up.t = sample(bcam.t,length(bcam.f),replace = TRUE)
up.f = sample(bcam.f,length(bcam.f),replace = FALSE)
Y.up = Y.train[c(up.t,up.f)]
X.up = X.train[c(up.t,up.f),]
fit random forest and calculate cv error
rf.up = randomForest(X.up,Y.up)
cv.rf(X.train,Y.train,.5,sample = "up")
## [[1]]
## [1] 314.8
##
## [[2]]
## [1] 0.9291268
same as above for undersampled training set
down.t = sample(bcam.t,length(bcam.t),replace = FALSE)
down.f = sample(bcam.f,length(bcam.t),replace = FALSE)
Y.down = Y.train[c(down.t,down.f)]
X.down = X.train[c(down.t,down.f),]
rf.down = randomForest(X.down,Y.down)
cv.rf(X.train,Y.train,.5,sample = "down")
## [[1]]
## [1] 275
##
## [[2]]
## [1] 0.3993785
For the regular training set, search for the optimal classification threshold
thresholds = c(0.001, 0.01, 0.1, .25, .5)
for(x in thresholds){
print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "none")[[1]]))
}
## [1] 0.001 298.280
## [1] 0.01 287.76
## [1] 0.10 268.92
## [1] 0.25 308.48
## [1] 0.50 326.36
thresholds = 0.1+0.01*(0:10)
for(x in thresholds){
print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "none")[[1]]))
}
## [1] 0.10 270.88
## [1] 0.11 278.80
## [1] 0.12 274.92
## [1] 0.13 274.52
## [1] 0.14 278.52
## [1] 0.15 284.68
## [1] 0.16 282.32
## [1] 0.17 288.64
## [1] 0.18 291.12
## [1] 0.19 290.68
## [1] 0.20 296.48
rf(X.train,Y.train,X.test,Y.test,.10)
## [[1]]
##
## False True
## False 255 22
## True 189 34
##
## [[2]]
## [1] 0.4256757
##
## [[3]]
## [1] 0.3928571
##
## [[4]]
## [1] 387
rf(X.train,Y.train,X.train,Y.train,.1)
## [[1]]
##
## False True
## False 1388 0
## True 107 182
##
## [[2]]
## [1] 0.07157191
##
## [[3]]
## [1] 0
##
## [[4]]
## [1] 107
For the oversampled training set, search for the optimal classification threshold
thresholds = c(0.001, 0.01, 0.1, .25, .5)
for(x in thresholds){
print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9,sample = "up")[[1]]))
}
## [1] 0.001 298.800
## [1] 0.01 293.16
## [1] 0.10 273.48
## [1] 0.25 275.32
## [1] 0.5 316.6
thresholds = 0.1+0.01*(0:10)
for(x in thresholds){
print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "up")[[1]]))
}
## [1] 0.10 271.28
## [1] 0.11 268.68
## [1] 0.12 266.40
## [1] 0.13 268.84
## [1] 0.14 270.08
## [1] 0.15 261.64
## [1] 0.16 267.60
## [1] 0.17 264.08
## [1] 0.18 270.16
## [1] 0.19 273.80
## [1] 0.20 274.88
rf(X.up,Y.up,X.test,Y.test,.17)
## [[1]]
##
## False True
## False 257 23
## True 187 33
##
## [[2]]
## [1] 0.4211712
##
## [[3]]
## [1] 0.4107143
##
## [[4]]
## [1] 394
rf(X.up,Y.up,X.up,Y.up,.17)
## [[1]]
##
## False True
## False 1315 0
## True 180 1495
##
## [[2]]
## [1] 0.1204013
##
## [[3]]
## [1] 0
##
## [[4]]
## [1] 180
For the undersampled training set, search for the optimal classification threshold
thresholds = c(0.001, 0.01, 0.1, .25, .5)
for(x in thresholds){
print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "down")[[1]]))
}
## [1] 0.001 299.000
## [1] 0.01 299.00
## [1] 0.10 298.08
## [1] 0.25 288.00
## [1] 0.5 276.2
thresholds = 0.4+0.01*(0:10)
for(x in thresholds){
print(c(x,cv.rf(X.train,Y.train,x, r = 5, c = 9, sample = "down")[[1]]))
}
## [1] 0.4 269.0
## [1] 0.41 265.40
## [1] 0.42 261.20
## [1] 0.43 271.68
## [1] 0.44 267.80
## [1] 0.45 265.76
## [1] 0.46 270.52
## [1] 0.47 268.52
## [1] 0.48 278.04
## [1] 0.49 266.96
## [1] 0.50 278.04
rf(X.down,Y.down,X.down,Y.down,.5)
## [[1]]
##
## False True
## False 182 0
## True 0 182
##
## [[2]]
## [1] 0
##
## [[3]]
## [1] 0
##
## [[4]]
## [1] 0